library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jsonlite)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(modelr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
library(tibble)
library(ggplot2)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Apro il file JSON, per come è fatto devo accedere al primo elemento e posso quindi caricarlo in un tibble. Sono 14.536 liste.
complete <- read_json("complete.json")
programs <- complete[[1]]
programsT <- tibble(programs)
remove(complete)
remove(programs)
A questo punto ho la possibilità di accedere ai programmi, controllo che siano tutte liste di lunghezza 6.
programsT %>%
mutate(programs_L = sapply(programs,length)) %>%
filter(programs_L != 6)
## # A tibble: 0 × 2
## # … with 2 variables: programs <list>, programs_L <int>
Posso quindi espandere i programmi.
programsT <- programsT %>% unnest_wider(programs)
programsT %>% distinct(programID)
## # A tibble: 14,531 × 1
## programID
## <chr>
## 1 3853
## 2 5178
## 3 10785
## 4 5887
## 5 305
## 6 3368
## 7 4226
## 8 5087
## 9 6310
## 10 1979
## # … with 14,521 more rows
programsT %>% select(programID) %>% n_distinct() # ci sono 14531 programmi
## [1] 14531
Nella maggior parte dei casi a un programma è associato un concerto, ma non è sempre così. In totale ci sono 22525 concerti.
programsT %>%
mutate(concerts_L=sapply(concerts,length)) %>%
count(concerts_L) %>%
mutate(concertN = concerts_L * n) %>%
mutate(concertsT = cumsum(concertN)) %>%
arrange(-concertsT)
## # A tibble: 11 × 4
## concerts_L n concertN concertsT
## <int> <int> <int> <int>
## 1 16 4 64 22525
## 2 12 4 48 22461
## 3 9 7 63 22413
## 4 8 2 16 22350
## 5 7 2 14 22334
## 6 6 5 30 22320
## 7 5 84 420 22290
## 8 4 1062 4248 21870
## 9 3 1033 3099 17622
## 10 2 2190 4380 14523
## 11 1 10143 10143 10143
Espando i concerti, ottengo una riga per ogni concerto. Passo da 14.536 a 22.525 righe. Aggiungo anche una colonna concertID. Passo a 7 colonne.
programsT <- programsT %>% unnest_longer(concerts)
nrow(programsT)
## [1] 22525
programsT <- rowid_to_column(programsT,var = "concertID")
programsT <- select(programsT,id,programID,orchestra,season,concertID,concerts,works)
Verifico che ogni concerto abbia 5 campi.
programsT %>%
mutate(concerts_L = sapply(concerts, length)) %>%
filter(concerts_L != 5)
## # A tibble: 0 × 8
## # … with 8 variables: id <chr>, programID <chr>, orchestra <chr>, season <chr>,
## # concertID <int>, concerts <list>, works <list>, concerts_L <int>
Posso espandere con unnest_wider. Allargo il tibble di 4 colonne, al posto di concerts ho eventType, Location, Venue, Date e Time.
programsT <- programsT %>% unnest_wider(concerts)
Come si può vedere, un programma può contenere più concerti.
programsT %>% group_by(programID) %>% count(sort = TRUE)
## # A tibble: 14,531 × 2
## # Groups: programID [14,531]
## programID n
## <chr> <int>
## 1 10700 16
## 2 10702 16
## 3 3128 16
## 4 3139 16
## 5 10701 12
## 6 10703 12
## 7 3134 12
## 8 3144 12
## 9 14385 9
## 10 14403 9
## # … with 14,521 more rows
Per esempio il programma 10700 contiene 16 concerti. In questo caso vediamo anche che di alcuni concerti non è indicata l’ora. Per questo motivo ho introdotto un concertID, se in seguito mi fossi limitato a identificare il concerto con eventType, Location, Venue, Date, Time, avrei perso alcuni concerti.
programsT %>%
filter(programID == 10700)
## # A tibble: 16 × 11
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 dbe4… 10700 New Yo… 1950-… 8525 Special Manhat… Roxy… 1951… 12:4… <list>
## 2 dbe4… 10700 New Yo… 1950-… 8526 Special Manhat… Roxy… 1951… None <list>
## 3 dbe4… 10700 New Yo… 1950-… 8527 Special Manhat… Roxy… 1951… None <list>
## 4 dbe4… 10700 New Yo… 1950-… 8528 Special Manhat… Roxy… 1951… None <list>
## 5 dbe4… 10700 New Yo… 1950-… 8529 Special Manhat… Roxy… 1951… 12:4… <list>
## 6 dbe4… 10700 New Yo… 1950-… 8530 Special Manhat… Roxy… 1951… None <list>
## 7 dbe4… 10700 New Yo… 1950-… 8531 Special Manhat… Roxy… 1951… None <list>
## 8 dbe4… 10700 New Yo… 1950-… 8532 Special Manhat… Roxy… 1951… None <list>
## 9 dbe4… 10700 New Yo… 1950-… 8533 Special Manhat… Roxy… 1951… 12:4… <list>
## 10 dbe4… 10700 New Yo… 1950-… 8534 Special Manhat… Roxy… 1951… None <list>
## 11 dbe4… 10700 New Yo… 1950-… 8535 Special Manhat… Roxy… 1951… None <list>
## 12 dbe4… 10700 New Yo… 1950-… 8536 Special Manhat… Roxy… 1951… None <list>
## 13 dbe4… 10700 New Yo… 1950-… 8537 Special Manhat… Roxy… 1951… None <list>
## 14 dbe4… 10700 New Yo… 1950-… 8538 Special Manhat… Roxy… 1951… None <list>
## 15 dbe4… 10700 New Yo… 1950-… 8539 Special Manhat… Roxy… 1951… None <list>
## 16 dbe4… 10700 New Yo… 1950-… 8540 Special Manhat… Roxy… 1951… 12:4… <list>
## # … with abbreviated variable names ¹programID, ²orchestra, ³concertID,
## # ⁴eventType, ⁵Location
Ogni programma contiene un certo numero di lavori (works). In totale ci sono 125.224 works (si tratta di works che possono essere ripetuti in diversi programs).
programsT %>%
mutate(works_L=sapply(works,length)) %>%
count(works_L) %>%
mutate(works_N = works_L * n) %>%
mutate(works_T = cumsum(works_N)) %>%
arrange(-works_T)
## # A tibble: 37 × 4
## works_L n works_N works_T
## <int> <int> <int> <int>
## 1 50 13 650 125224
## 2 48 4 192 124574
## 3 46 5 230 124382
## 4 41 1 41 124152
## 5 36 1 36 124111
## 6 32 4 128 124075
## 7 30 7 210 123947
## 8 29 2 58 123737
## 9 28 6 168 123679
## 10 27 7 189 123511
## # … with 27 more rows
Ci sono dei program che hanno 0 works, che cosa sono? Si tratta di eventi che non prevedono musica o dei quali mancano informazioni.
programsT %>%
mutate(works_L=sapply(works,length)) %>%
filter(works_L == 0)
## # A tibble: 29 × 12
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 2a85… 13816 New Yo… 1894-… 577 Subscr… Manhat… Metr… 1895… 8:00… <NULL>
## 2 21fa… 12310 Stadiu… 1934-… 5697 Stadiu… Manhat… Grea… 1935… 8:30… <NULL>
## 3 e76c… 10203 Member… 1943-… 7229 Studen… Manhat… Hunt… 1944… 3:30… <NULL>
## 4 1b6c… 8607 NYP Pr… 1998-… 17636 Chamber Manhat… Alic… 1998… 8:00… <NULL>
## 5 e91b… 13779 Musici… 2007-… 19411 Musici… Manhat… Hull… 2007… 6:30… <NULL>
## 6 7112… 13665 Musici… 2012-… 20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
## 7 3b66… 13666 NYP Pr… 2012-… 20700 Chambe… Manhat… St. … 2013… 7:00… <NULL>
## 8 26fe… 13667 Musici… 2012-… 20704 Chambe… Manhat… Racq… 2013… None <NULL>
## 9 98f2… 13749 NYP Pr… 2013-… 20745 Chamber Manhat… The … 2013… 3:00… <NULL>
## 10 3dd4… 13341 NYP Pr… 2013-… 20767 Chambe… Manhat… None 2013… None <NULL>
## # … with 19 more rows, 1 more variable: works_L <int>, and abbreviated variable
## # names ¹programID, ²orchestra, ³concertID, ⁴eventType, ⁵Location
Posso utilizzare unnest_longer, ogni lavoro associato a un programma darà origine a una riga. Devo usare l’opzione keep_empty TRUE per non perdere le righe con zero works. Quindi il tibble passerà a 125.224 + 29 righe, ovvero 125.253.
programsT <- programsT %>% unnest_longer(works, keep_empty = TRUE)
programsT %>% filter(programID == 13665)
## # A tibble: 1 × 11
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 71124… 13665 Musici… 2012-… 20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
## # … with abbreviated variable names ¹programID, ²orchestra, ³concertID,
## # ⁴eventType, ⁵Location
Ora works contiene liste di lunghezze diverse, a seconda delle caratteristiche del work compreso.
programsT %>%
mutate(works_L=sapply(works,length)) %>%
count(works_L)
## # A tibble: 5 × 2
## works_L n
## <int> <int>
## 1 0 29
## 2 3 18489
## 3 4 5292
## 4 5 68041
## 5 6 33402
Le liste di lunghezza 3 sono quelle che contengono le intermissions.
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 3) %>%
select(works) %>%
unnest_wider(works) %>%
count(ID, interval, soloists)
## # A tibble: 4 × 4
## ID interval soloists n
## <chr> <chr> <lgl> <int>
## 1 0* Intermission NA 18394
## 2 1743* Intermission-Third NA 1
## 3 4346* Intermission-Short NA 54
## 4 7955* Intermission-Second NA 40
Cerco di capire cosa contengono le altre liste (di lunghezza 4, 5 e 6).
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 4) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 5,292 × 4
## ID composerName workTitle soloists
## <chr> <chr> <list> <list>
## 1 3642* Hummel, Johann <chr [1]> <list [5]>
## 2 52425* Beethoven, Ludwig van <chr [1]> <list [7]>
## 3 3712* Bochsa, Robert N. C. <chr [1]> <list [2]>
## 4 3764* Lindpaintner, Peter Von <chr [1]> <list [5]>
## 5 3712* Bochsa, Robert N. C. <chr [1]> <list [2]>
## 6 3764* Lindpaintner, Peter Von <chr [1]> <list [5]>
## 7 3894* Traditional, <chr [1]> <list [2]>
## 8 3468* Haydn, Franz Joseph <chr [1]> <list [1]>
## 9 3991* Donizetti, Gaetano <chr [1]> <list [2]>
## 10 3999* Hummel, Johann <chr [1]> <list [7]>
## # … with 5,282 more rows
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 5) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 68,041 × 6
## ID composerName workTitle conductorName soloi…¹ movem…²
## <chr> <chr> <list> <chr> <list> <list>
## 1 52446* Beethoven, Ludwig van <chr [1]> Hill, Ureli Corel… <NULL> <NULL>
## 2 5543* Kalliwoda, Johann W. <chr [1]> Timm, Henry C. <NULL> <NULL>
## 3 52437* Beethoven, Ludwig van <chr [1]> Hill, Ureli Corel… <NULL> <NULL>
## 4 3659* Romberg, Bernhard <chr [1]> Hill, Ureli Corel… <list> <NULL>
## 5 4567* Hummel, Johann <chr [1]> Hill, Ureli Corel… <list> <NULL>
## 6 5150* Pacini, Giovanni <chr [1]> Not conducted <list> <NULL>
## 7 5161* Romberg, Bernhard <chr [1]> Not conducted <list> <NULL>
## 8 5166* Thalberg, Sigismond <chr [1]> Not conducted <list> <NULL>
## 9 5172* Herz, Henri <chr [1]> Alpers, William <list> <NULL>
## 10 5174* Lindpaintner, Peter Von <chr [1]> Not conducted <list> <NULL>
## # … with 68,031 more rows, and abbreviated variable names ¹soloists, ²movement
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 6) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 33,402 × 6
## ID composerName workTitle movem…¹ condu…² soloi…³
## <chr> <chr> <chr> <list> <chr> <list>
## 1 8834*4 Weber, Carl Maria Von OBERON <chr> Timm, … <list>
## 2 8834*3 Weber, Carl Maria Von OBERON <chr> Etienn… <NULL>
## 3 8835*1 Rossini, Gioachino ARMIDA <chr> Timm, … <list>
## 4 8837*6 Beethoven, Ludwig van FIDELIO, OP. 72 <chr> Timm, … <list>
## 5 8336*4 Mozart, Wolfgang Amadeus ABDUCTION FROM TH… <chr> Timm, … <list>
## 6 8838*2 Bellini, Vincenzo I PURITANI <chr> Hill, … <list>
## 7 8839*2 Rossini, Gioachino WILLIAM TELL <chr> Alpers… <NULL>
## 8 53076*2 Rossini, Gioachino STABAT MATER <chr> Alpers… <list>
## 9 51568*2 Hummel, Johann CONCERTO, PIANO, … <chr> Alpers… <list>
## 10 51568*3 Hummel, Johann CONCERTO, PIANO, … <chr> Alpers… <list>
## # … with 33,392 more rows, and abbreviated variable names ¹movement,
## # ²conductorName, ³soloists
Faccio unnest_wider di works. Passo da 11 colonne a 17 (al posto di works si aggiungono tutte le colonne comuni ai diversi works, ovvero ID, composerName, workTitle, movement, conductorName, soloists, interval)
programsT <- programsT %>% unnest_wider(works)
Passo a prendere in considerazione workTitle.
programsT %>%
mutate(workTitle_L = sapply(workTitle,length)) %>%
count(workTitle_L)
## # A tibble: 3 × 2
## workTitle_L n
## <int> <int>
## 1 0 18518
## 2 1 106723
## 3 2 12
Come sono gli elementi di lunghezza 2.
programsT %>%
mutate(workTitle_L = sapply(workTitle, length)) %>%
filter(workTitle_L == 2) %>%
pull(workTitle) %>%
head(3)
## [[1]]
## [[1]]$`_`
## [1] ", PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL"
##
## [[1]]$em
## [1] "PARSIFAL"
##
##
## [[2]]
## [[2]]$`_`
## [1] "CHORUS OF VILLAGERS FROM (ARR. CLARINET ENS.) (ARR. Bellison)"
##
## [[2]]$em
## [1] "PRINCE IGOR"
##
##
## [[3]]
## [[3]]$`_`
## [1] "AIR FROM (ARR. CLARINET ENS.) (ARR. Bellison)"
##
## [[3]]$em
## [1] "SUITE IN D MAJOR"
Mi salvo i program ID per ritrovarli dopo la modifica.
wt2ids <- programsT %>%
mutate(workTitle_L = sapply(workTitle, length)) %>%
filter(workTitle_L == 2) %>%
select(ID)
Applico funzione per sistemarli (sistemo anche NA al posto di NULL), faccio semi_join con gli id per verificare il risultato su quelli di lunghezza 2.
mod_w <- function(x) {
if (is.null(x)) return (NA)
if (length(x) == 1) return (x[[1]])
if (length(x) == 2) return (combine_w(x[[1]], x[[2]]))
}
combine_w <- function(x,y) {
if (startsWith(x,",")) {
return (paste(y,x))
} else {
return (paste(x,y))
}
}
programsT %>%
mutate(workTitle = sapply(workTitle, mod_w)) %>%
semi_join(wt2ids) %>%
select(workTitle)
## Joining with `by = join_by(ID)`
## # A tibble: 12 × 1
## workTitle
## <chr>
## 1 PARSIFAL , PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL
## 2 CHORUS OF VILLAGERS FROM (ARR. CLARINET ENS.) (ARR. Bellison) PRINCE IGOR
## 3 AIR FROM (ARR. CLARINET ENS.) (ARR. Bellison) SUITE IN D MAJOR
## 4 ANDANTE FROM (ARR. CLARINET ENS.) (ARR. Bellison) SURPRISE SYMPHONY
## 5 OVERTURE, DI BALLO
## 6 OVERTURE, DI BALLO
## 7 OVERTURE, DI BALLO
## 8 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 9 CONCERT FANTASY ON FOR VIOLIN AND PIANO (ARR. Sarasate) CARMEN
## 10 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 11 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 12 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
Eseguo l’operazione e ri-assegno a programsT.
programsT <- programsT %>%
mutate(workTitle = sapply(workTitle, mod_w))
Verifico la lunghezza degli elementi contenuti in movement.
programsT %>%
mutate(movement_L = sapply(movement, length)) %>%
count(movement_L)
## # A tibble: 3 × 2
## movement_L n
## <int> <int>
## 1 0 89759
## 2 1 35346
## 3 2 148
Vedo come sono quelli di lunghezza 2.
programsT %>%
mutate(movement_L = sapply(movement, length)) %>%
filter(movement_L == 2)
## # A tibble: 148 × 18
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6a02e… 1907 New Yo… 1849-… 33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
## 2 6a02e… 1907 New Yo… 1849-… 33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
## 3 6f3bc… 11474 Musici… 1876-… 172 Special Manhat… Stei… 1876… 8:00… 6529…
## 4 d3eff… 871 New Yo… 1879-… 228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
## 5 d3eff… 871 New Yo… 1879-… 228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
## 6 d4145… 9023 New Yo… 1898-… 695 Young … Manhat… Carn… 1899… 2:30… 6529…
## 7 09a41… 10541 New Yo… 1903-… 908 Young … Manhat… Carn… 1904… None 6529…
## 8 59816… 10512 New Yo… 1905-… 987 Subscr… Manhat… Carn… 1905… 2:30… 1063…
## 9 604c7… 10440 New Yo… 1913-… 1688 Young … Manhat… Carn… 1914… None 1050…
## 10 604c7… 10440 New Yo… 1913-… 1688 Young … Manhat… Carn… 1914… None 1050…
## # … with 138 more rows, 7 more variables: composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloists <list>, movement <list>, interval <chr>,
## # movement_L <int>, and abbreviated variable names ¹programID, ²orchestra,
## # ³concertID, ⁴eventType, ⁵Location
Mi limito a concatenarli, con la funzione mod_m. Inoltre sostituisco i NULL con NA.
mod_m <- function(x) {
if (is.null(x)) return (NA)
if (length(x) == 1) return (x[[1]])
if (length(x) == 2) return (paste(x[[1]], "_", x[[2]]))
}
programsT <- programsT %>% mutate(movement = sapply(movement, mod_m))
Controllo la lunghezza delle liste contenute in soloists. Ho 89.321 potenziali righe di informazioni su solisti + 85.122 righe nulle.
programsT %>%
mutate(soloists_L = sapply(soloists, length)) %>%
count(soloists_L) %>%
mutate(soloists_N = soloists_L * n) %>%
mutate(soloists_T = cumsum(soloists_N)) %>%
filter(soloists_L == 0 | soloists_L == max(soloists_L))
## # A tibble: 2 × 4
## soloists_L n soloists_N soloists_T
## <int> <int> <int> <int>
## 1 0 85122 0 0
## 2 84 1 84 89321
Provo unnest_longer di soloists con opzione keep_empty per tenere anche le righe che contengono liste NULL e vedo che sono di lunghezza 0 o 3.
programsT %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
mutate(soloists_L = sapply(soloists, length)) %>%
count(soloists_L)
## # A tibble: 2 × 2
## soloists_L n
## <int> <int>
## 1 0 85127
## 2 3 89316
Ce ne sono cinque che sono di lunghezza 1, ma poi NULL.
programsT %>%
mutate(soloists_L = sapply(soloists, length)) %>%
filter(soloists_L != 0) %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
mutate(soloists_L2 = sapply(soloists, length)) %>%
filter(soloists_L2 == 0)
## # A tibble: 5 × 19
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0afec4… 14489 Musici… 2019-… 22157 Virtua… Online… YouT… 2020… 7:40… 6979*
## 2 b64d6e… 14552 Musici… 2019-… 22165 Virtua… Online… YouT… 2020… 1:00… 5276…
## 3 23ee98… 14586 Musici… 2019-… 22171 Virtua… Online… Inst… 2020… 1:00… 1319…
## 4 70f296… 14477 Musici… 2019-… 22178 Virtua… Online… YouT… 2020… 1:20… 1320…
## 5 398d6d… 14470 Musici… 2019-… 22181 Virtua… Online… YouT… 2020… 10:4… 5007…
## # … with 8 more variables: composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloists <list>, movement <chr>, interval <chr>,
## # soloists_L <int>, soloists_L2 <int>, and abbreviated variable names
## # ¹programID, ²orchestra, ³concertID, ⁴eventType, ⁵Location
Per esempio.
programsT %>%
filter(programID == 14489) %>%
pull(soloists)
## [[1]]
## [[1]][[1]]
## NULL
Faccio quindi unnest_longer, poi unnest_wider, il numero di colonne passa da 17 a 19 (soloistName, soloistInstrument e soloistRoles al posto di soloists).
programsT <- programsT %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
unnest_wider(soloists)
programsT
## # A tibble: 174,443 × 19
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 5244…
## 2 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
## 3 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 4 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 5 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 6 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 7 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 8 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 0*
## 9 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
## 10 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8835…
## # … with 174,433 more rows, 8 more variables: composerName <chr>,
## # workTitle <chr>, conductorName <chr>, soloistName <chr>,
## # soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## # interval <chr>, and abbreviated variable names ¹programID, ²orchestra,
## # ³concertID, ⁴eventType, ⁵Location
id rappresenta il GUID, che fa parte dell’indirizzo a cui è possibile vedere il programma online (archives.nyphil.org/index.php/artifact/GUID/fullview). ci sono tre programID che sono associati a più GUID. Tengo comunque la colonna id.
ids <- programsT %>%
distinct(id, programID) %>%
group_by(programID) %>%
count(sort = TRUE) %>%
filter(n>1) %>%
select(programID) %>%
ungroup()
semi_join(programsT, ids) %>%
distinct(id, programID) %>%
select(id, programID)
## Joining with `by = join_by(programID)`
## # A tibble: 8 × 2
## id programID
## <chr> <chr>
## 1 af869073-9643-4dab-b182-948e3f2e6ab9-0.1 8950
## 2 f4cf3522-7910-4e90-9978-ab70ac615ae4-0.1 8950
## 3 b31c88f2-1774-4f8b-94cf-835952c65175-0.1 8950
## 4 38283dce-9333-46c4-828a-54c0ad957c7b-0.1 10525
## 5 ee1431f7-6f25-4e5f-b815-4be9d743cf03-0.1 10525
## 6 8a73c52f-95a8-411c-b6e8-3d7bc4d7b90a-0.1 10525
## 7 f4fd303d-46c7-4233-b087-0d2f5f91cc7b-0.1 5358
## 8 b533775d-c639-4f79-9838-f88a88f79e95-0.1 5358
Correggo come è scritta la data, aggiungo anche componenti anno, mese, giorno della data.
#ci sono 5591 righe che contengono None invece dell'orario in formato AM o PM
programsT %>%
mutate(trovato = str_detect(programsT$Time, "\\d{1,2}:\\d{1,2}[P,A]M")) %>%
filter(trovato == FALSE)
## # A tibble: 5,591 × 20
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5245…
## 2 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 3677…
## 3 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 0*
## 4 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 8955…
## 5 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5190…
## 6 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5072…
## 7 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3707*
## 8 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3712*
## 9 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3712*
## 10 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 0*
## # … with 5,581 more rows, 9 more variables: composerName <chr>,
## # workTitle <chr>, conductorName <chr>, soloistName <chr>,
## # soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## # interval <chr>, trovato <lgl>, and abbreviated variable names ¹programID,
## # ²orchestra, ³concertID, ⁴eventType, ⁵Location
# lascio Time com'è, sistemo la data, tenendo anche anno, mese, giorno
programsT <- programsT %>%
mutate(Date = str_extract(Date, "[[:digit:]-]+")) %>%
mutate(Date = ymd(Date)) %>%
mutate(Year = year(Date)) %>%
mutate(Month = month(Date)) %>%
mutate(Day = day(Date)) %>%
select(id, programID, orchestra, season, concertID, eventType, Location, Venue, Date, Year, Month, Day, Time, ID, composerName, workTitle, conductorName, soloistName, soloistInstrument, soloistRoles, movement, interval)
Divido ID in workID e movementID
programsT <- programsT %>%
separate(ID, into = c("workID", "movID"), sep = "\\*") %>%
mutate(workID = as.integer(workID)) %>%
mutate(movID = as.integer(movID))
Ci sono delle celle con valori multipli in conductorName.
programsT %>%
mutate(co_conductor = str_detect(conductorName, ";")) %>%
filter(co_conductor)
## # A tibble: 502 × 24
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Year
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <date> <dbl>
## 1 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 2 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 3 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 4 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 5 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 6 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 7 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 8 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 9 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 10 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## # … with 492 more rows, 14 more variables: Month <dbl>, Day <int>, Time <chr>,
## # workID <int>, movID <int>, composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloistName <chr>, soloistInstrument <chr>,
## # soloistRoles <chr>, movement <chr>, interval <chr>, co_conductor <lgl>, and
## # abbreviated variable names ¹programID, ²orchestra, ³concertID, ⁴eventType,
## # ⁵Location
Correggo un errore (un conductorName scritto come “de Waart, Edo; ; ; de Waart, Edo”).
programsT <- programsT %>%
mutate(conductorName = if_else(str_detect(conductorName, ";\\s*;\\s*;"), "de Waart, Edo", conductorName))
Correggo errori nei nomi dei compositori.
programsT <- programsT %>%
mutate(composerName = str_replace_all(composerName, "\\s+", " "))
Modifico aggiungendo una colonna che indica la presenza di co-conduzione.
programsT <- programsT %>%
mutate(conductorName = str_remove(conductorName, "^; ")) %>%
mutate(co_conductor = str_detect(conductorName, ";")) %>%
separate_longer_delim(conductorName, "; ")
Costruisco un tibble ridotto per le analisi successive.
performances <- programsT %>%
filter(is.na(interval)) %>%
distinct(orchestra,eventType,season,concertID, Date, composerName, workTitle, conductorName, co_conductor) %>%
mutate(year = year(Date)) %>%
mutate(seasonN = dense_rank(season)) %>%
group_by(seasonN) %>%
mutate(seasonYear = sapply(season,function (x) as.integer(str_extract(x, "[[:digit:]]+")))) %>%
ungroup()
remove(programsT)
Orchestre presenti nel database.
ggplot(performances) +
geom_bar(aes(y = orchestra))
Orchestre nel tempo.
ggplot(performances) +
geom_point(aes(seasonN, orchestra))
Andamento numero performance nelle stagioni.
performances %>%
group_by(seasonN) %>%
count() %>%
ggplot(mapping = aes(x = seasonN, y = n)) +
geom_point()
Che tipi di performance ci sono. Mi limito alle categorie che contengono più di 100 performance.
performances %>%
group_by(eventType) %>%
count() %>%
filter(n > 100) %>%
ggplot(aes(x = n, y = eventType)) +
geom_bar(stat = "identity")
performances %>%
group_by(eventType) %>%
filter(orchestra == "New York Philharmonic") %>%
count() %>%
filter(n > 100) %>%
ggplot(aes(x = n, y = eventType)) +
geom_bar(stat = "identity")
performances %>%
filter(orchestra == "New York Philharmonic") %>%
mutate(group = if_else(eventType == "Subscription Season", "SUB", "REST")) %>%
group_by(seasonYear,group) %>%
count() %>%
ggplot(aes(x = seasonYear, y = n, colour = group)) +
geom_point()
Quali sono gli eventType preponderanti dal 2000 in poi non di tipo Subscription Season?
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(year >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
ggplot(aes(x = seasonYear, y = n, color = eventType)) +
geom_line(alpha = 1/2, show.legend = FALSE)
Fra gli eventi che non fanno parte della stagione regolare, quelli presenti con maggior continuità sono tour e concerti non-subscription.
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(year >= 2000) %>%
filter(eventType != "Subscription Season") %>%
ggplot(aes(y=eventType)) +
geom_bar()
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(year >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
group_by(seasonYear) %>%
mutate(seasonEvents = sum(n)) %>%
ungroup() %>%
mutate(perc = n / seasonEvents) %>%
group_by(seasonYear) %>%
filter(perc > 0.2) %>%
ungroup() %>%
ggplot() +
geom_point(aes(seasonYear, perc, color = eventType)) +
geom_line(aes(seasonYear, perc, color = eventType))
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(year >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
group_by(seasonYear) %>%
mutate(seasonEvents = sum(n)) %>%
ungroup() %>%
mutate(perc = n / seasonEvents) %>%
group_by(seasonYear) %>%
filter(eventType == "Bandwagon")
## # A tibble: 1 × 5
## # Groups: seasonYear [1]
## seasonYear eventType n seasonEvents perc
## <int> <chr> <int> <int> <dbl>
## 1 2020 Bandwagon 47 131 0.359
Da ora in poi mi limito a considerare New York Philharmonic.
performances <- performances %>%
filter(orchestra == "New York Philharmonic")
Si possono trovare dei trend che ci permettano di dire qualcosa sulla popolarità di alcuni compositori nel corso del tempo? Qui sotto un grafico con stagioni - numero di performance per compositore.
pcp <- performances %>%
group_by(seasonYear, composerName) %>%
count() %>%
group_by(seasonYear) %>%
mutate(totalSeason = sum(n)) %>%
ungroup() %>%
mutate(perc = n / totalSeason)
saveRDS(pcp, file = "pcp.RDS")
ggplot(pcp,aes(seasonYear, n, color = composerName)) +
geom_line(alpha = 1/4, show.legend = FALSE)
Questo con le percentuali. Per ogni compositore, la percentuale di performance del compositore rispetto al totale della stagione.
ggplot(pcp,aes(seasonYear, perc, color = composerName)) +
geom_line(alpha = 1/4, show.legend = FALSE)
Costruisco un modello quadratico per tutti i compositori (dipendente percentuale, indipendenti stagione e stagione al quadrato). Tolgo i compositori che compaiono per meno di 11 stagioni.
pcp <- pcp %>%
mutate(seasonYear_2 = seasonYear**2)
tenorless <- performances %>%
group_by(composerName) %>%
summarise(seasons = n_distinct(seasonYear)) %>%
filter(seasons <= 10)
pcp_rid <- pcp %>%
anti_join(tenorless)
## Joining with `by = join_by(composerName)`
pcp_nested <- pcp_rid %>%
group_by(composerName) %>%
nest()
pcp_model <- function(df) {
lm(perc ~ seasonYear + seasonYear_2, data = df)
}
pcp_nested <- pcp_nested %>%
mutate(model = map(data, pcp_model))
pcp_nested <- pcp_nested %>%
mutate(data = map2(data, model, add_residuals))
pcp_nested <- pcp_nested %>%
mutate(glance = map(model, broom::glance))
pcp_models_simp <- pcp_nested %>%
mutate(model = map(model, tidy)) %>%
unnest_longer(model) %>%
unnest_wider(model) %>%
select(composerName,term,estimate) %>%
pivot_wider(names_from = term,values_from = estimate)
saveRDS(pcp_models_simp, file = "pcp_models_simp.RDS")
pcp_glance <- unnest(pcp_nested, glance, names_sep = "_")
saveRDS(pcp_glance, file = "pcp_glance.RDS")
pcp_resids <- unnest(pcp_nested, data)
saveRDS(pcp_resids, file = "pcp_resids.RDS")
Prendo un subset di compositori per poter filtrare in un secondo momento. Almeno 80 stagioni, con densità 70 %. Sono 25 compositori, responsabili per il 57 % delle performance.
over80 <- performances %>%
group_by(composerName) %>%
mutate(
seasons = n_distinct(seasonYear),
first = min(seasonYear),
last = max(seasonYear),
span = last - first + 1,
density = seasons / span) %>%
filter(seasons > 80 & density > 0.70) %>%
distinct(composerName)
performances %>%
mutate (totali = n()) %>%
inner_join(over80) %>%
mutate (over80 = n(), perc = over80 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.574
Simile, più di 40 stagioni, con densità del 70%.
over40 <- performances %>%
group_by(composerName) %>%
mutate(
seasons = n_distinct(seasonYear),
first = min(seasonYear),
last = max(seasonYear),
span = last - first + 1,
density = seasons / span) %>%
filter(seasons >= 40 & density > 0.70) %>%
distinct(composerName)
performances %>%
mutate (totali = n()) %>%
inner_join(over40) %>%
mutate (over40 = n(), perc = over40 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.639
Un altro subset, compositori rappresentati prima della stagione 1909, per almeno 10 stagioni. Rappresentano il 43 % delle performance totali.
before_1909 <- performances %>%
group_by(composerName) %>%
filter(seasonYear < 1909) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 10) %>%
distinct(composerName)
saveRDS(before_1909, file = "before_1909")
performances %>%
mutate (totali = n()) %>%
inner_join(before_1909) %>%
mutate (before_1909 = n(), perc = before_1909 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.439
Modelli in cui tutte e due le componenti sono positive. Non ci sono.
up_up <- pcp_models_simp %>%
filter(seasonYear > 0, seasonYear_2 > 0) %>%
select(composerName) %>%
head(20)
Modelli in cui tutte e due le componenti sono negative. Non ci sono.
down_down <- pcp_models_simp %>%
filter(seasonYear<0,seasonYear_2<0) %>%
select(composerName) %>%
head(20)
Modelli in cui la componente quadratica è positiva e la componente lineare è negativa. Primi 20 per componente lineare.
down_up <- pcp_models_simp %>%
filter(seasonYear<0,seasonYear_2>0) %>%
arrange(seasonYear) %>%
select(composerName) %>%
head(20)
pcp %>%
semi_join(down_up) %>%
ggplot(aes(x = seasonYear, y = perc)) +
geom_point(alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Modelli in cui la componente quadratica è negativa e in cui la componente lineare è positiva. Filtrati per componente lineare più grande.
up_down <- pcp_models_simp %>%
filter(seasonYear>0,seasonYear_2<0) %>%
arrange(-seasonYear) %>%
select(composerName) %>%
head(20)
pcp %>%
semi_join(up_down) %>%
ggplot(aes(x = seasonYear, y = perc)) +
geom_point(alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Residui.
pcp_resids %>%
ggplot(aes(resid)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plott dei residui nel tempo.
ggplot(pcp_resids) +
geom_point(aes(seasonYear, resid), alpha = 1/4, show.legend = FALSE)
Guardo ai compositori che hanno residui più lontani di 0.1 da 0.
highres <- pcp_resids %>%
filter(resid > 0.1 | resid < -0.1) %>%
distinct(composerName)
pcp %>%
semi_join(highres) %>%
ggplot() +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Modelli che fittano abbastanza bene.
pcp %>%
semi_join(
pcp_glance %>%
filter(glance_r.squared > 0.6) %>%
select(composerName)
) %>%
ggplot() +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Solo gli over 80.
ggplot(
pcp %>%
inner_join(over80)
) +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Gini compositori.
library(ineq)
gini_comp <- pcp %>%
select(seasonYear,n) %>%
group_by(seasonYear) %>%
mutate(gini = Gini(n)) %>%
distinct(seasonYear,gini)
ggplot(gini_comp) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
geom_line(aes(seasonYear,gini))
Quelli che sono stati rappresentati prima della stagione 1909-1910 per almeno dieci stagioni.
ggplot(
pcp %>%
inner_join(before_1909)
) +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Numero di compositori per stagione.
performances %>%
group_by(seasonYear) %>%
mutate(composers = n_distinct(composerName)) %>%
ungroup() %>%
ggplot() +
geom_point(aes(seasonYear,composers)) +
geom_vline(aes(xintercept = 1909, color = "red", show.legend = FALSE))
## Warning in geom_vline(aes(xintercept = 1909, color = "red", show.legend =
## FALSE)): Ignoring unknown aesthetics: show.legend
Stagioni che hanno Gini compositori maggiore di 0.5.
pcp %>%
inner_join(
gini_comp %>%
filter(gini > 0.5)
) %>%
ggplot(aes(seasonYear,perc,color=composerName)) +
geom_point(show.legend = FALSE)
## Joining with `by = join_by(seasonYear)`
Stagioni che hanno Gini compositori maggiore di 0.5. Compositori che nella stagione hanno più dell’10% delle performance.
pcp %>%
inner_join(
gini_comp %>%
filter(gini > 0.5)
) %>%
filter(perc > 0.1) %>%
ggplot(aes(seasonYear,perc,color=composerName)) +
geom_point(show.legend = TRUE)
## Joining with `by = join_by(seasonYear)`
Mediana percentuali per stagione.
pcp %>%
group_by(seasonYear) %>%
mutate(median = median(perc)) %>%
ggplot() +
geom_line(aes(seasonYear,median)) +
geom_vline(aes(xintercept = 1909, color = "red")) +
annotate("text", x = 1917, y = 0.06, label="1909", angle=0)
Si possono individuare nei trend all’interno dell’evoluzione dei repertori dei conduttori?
ccn2 <- performances %>%
filter(!is.na(conductorName)) %>%
filter(conductorName != "Not conducted") %>%
group_by(seasonYear,conductorName,composerName) %>%
mutate(comp_cond_season = n()) %>%
group_by(seasonYear,conductorName) %>%
mutate(cond_season = n()) %>%
mutate(perc = comp_cond_season / cond_season) %>%
group_by(conductorName,composerName) %>%
mutate(seasonsCC = n_distinct(seasonYear)) %>%
filter(seasonsCC > 5) %>%
group_by(conductorName) %>%
mutate(seasonsC = n_distinct(seasonYear)) %>%
filter(seasonsC > 5) %>%
mutate(seasonYear_2 = seasonYear**2) %>%
ungroup()
saveRDS(ccn2, file = "ccn2.RDS")
ccn2_nested <- ccn2 %>%
group_by(conductorName,composerName) %>%
nest()
ccn2_func <- function(df) {
lm(perc ~ seasonYear + seasonYear_2, data = df)
}
regs <- ccn2_nested %>%
mutate(
model = map(data, ccn2_func),
tidied = map(model, tidy),
glanced = map(model, glance),
augmented = map(model, augment)
)
saveRDS(regs, file = "regs.RDS")
Modelli molto buoni e “buoni”.
very_good <- regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.95) %>%
select(conductorName, composerName)
good <- regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.7) %>%
select(conductorName, composerName)
Come sono distribuiti i residui.
regs %>%
unnest(augmented) %>%
select(.resid) %>%
ggplot() +
geom_histogram(aes(.resid))
## Adding missing grouping variables: `conductorName`, `composerName`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Guardo a quelli più alti di 0.2
highres <- regs %>%
unnest(augmented) %>%
filter(.resid > 0.2 | .resid < -0.2) %>%
distinct(conductorName,composerName,seasonYear,.resid)
Direttori-compositori con fit molto buono.
ccn2 %>%
inner_join(very_good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~conductorName)
## Joining with `by = join_by(composerName, conductorName)`
Bernstein, compositori con fit molto buono.
ccn2 %>%
filter(conductorName == "Bernstein, Leonard") %>%
inner_join(very_good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName, conductorName)`
Bernstein, compositori che non sono nel fit molto buono.
ccn2 %>%
filter(conductorName == "Bernstein, Leonard") %>%
anti_join(very_good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName, conductorName)`